Script to visualize the results from kappa_loss_analysis.R.

Data

# Load data
results <- read.csv("results/threshold_results.csv", stringsAsFactors = FALSE)
klc_data <- readRDS("results/KLC_plot_deciles.rds")

# Clean up data
results$critical_percentage <- as.numeric(as.character(results$critical_percentage))
results$kappa_at_critical <- as.numeric(as.character(results$kappa_at_critical))
klc_data <- klc_data %>%
  filter(noise %in% c(10, 20, 30)) %>%
  select(-c(accuracy, kappa, dataset_order, method_order))

Summary Statistics

# Display the raw data from CSV
kable(head(results, 20), caption = "Raw Data from threshold_results.csv (First 20 Rows)")
Raw Data from threshold_results.csv (First 20 Rows)
dataset_name technique noise_level threshold critical_percentage kappa_at_critical
analcatdata_authorship C5.0 10 0.05 100 0.00
badges2 C5.0 10 0.05 90 0.05
banknote C5.0 10 0.05 100 0.00
blood-transfusion-service-center C5.0 10 0.05 70 0.05
breast-w C5.0 10 0.05 100 0.01
cardiotocography C5.0 10 0.05 10 0.05
climate-model-simulation-crashes C5.0 10 0.05 100 0.05
cmc C5.0 10 0.05 40 0.05
credit-g C5.0 10 0.05 100 0.04
diabetes C5.0 10 0.05 100 0.03
eucalyptus C5.0 10 0.05 70 0.04
iris C5.0 10 0.05 100 0.02
kc1 C5.0 10 0.05 70 0.05
liver-disorders C5.0 10 0.05 20 0.05
mfeat-factors C5.0 10 0.05 100 0.00
mfeat-karhunen C5.0 10 0.05 100 0.01
mfeat-zernike C5.0 10 0.05 100 0.02
ozone-level-8hr C5.0 10 0.05 100 0.01
pc4 C5.0 10 0.05 30 0.04
phoneme C5.0 10 0.05 70 0.05
# Create interactive table of the full dataset
datatable(results,
          options = list(
            pageLength = 10,
            scrollX = TRUE,
            dom = 'Bfrtip',
            buttons = c('copy', 'csv', 'excel')
          ),
          extensions = 'Buttons',
          caption = "Complete Raw Data from threshold_results.csv")
# Summary statistics by threshold level
threshold_summary <- results %>%
  group_by(threshold) %>%
  summarise(
    mean_critical = mean(critical_percentage, na.rm = TRUE),
    median_critical = median(critical_percentage, na.rm = TRUE),
  )

kable(threshold_summary, caption = "Summary of Critical Percentages by Threshold Level")
Summary of Critical Percentages by Threshold Level
threshold mean_critical median_critical
0.05 68.76679 80
0.10 84.16361 100
0.15 90.68987 100
# Summary by technique
technique_summary <- results %>%
  group_by(technique) %>%
  summarise(
    mean_critical = mean(critical_percentage, na.rm = TRUE),
    median_critical = median(critical_percentage, na.rm = TRUE),
  ) %>%
  arrange(median_critical)

kable(technique_summary, caption = "Summary of Critical Percentages by ML Technique")
Summary of Critical Percentages by ML Technique
technique mean_critical median_critical
rbfDDA 60.81197 80
C5.0 79.10256 100
JRip 81.79487 100
PART 80.17094 100
bayesglm 81.53846 100
ctree 81.45299 100
fda 76.96581 100
gbm 84.23077 100
gcvEarth 75.98291 100
knn 88.07692 100
lvq 80.89744 100
mlpML 94.61538 100
multinom 71.83761 100
naive_bayes 85.64103 100
rda 73.50427 100
rf 84.10256 100
rfRules 87.35043 100
rpart 80.59829 100
simpls 86.28205 100
svmLinear 84.52991 100
svmRadial 85.85470 100

Visualization of Critical Percentages

Kappa Loss Curves Visualization

# Get unique dataset names and technique names
datasets <- unique(klc_data$dataset_name)
method_names <- unique(klc_data$technique)

# Create a new column to control the order of datasets
klc_data$dataset_order <- factor(klc_data$dataset_name, levels = datasets)

# Create a new column to control the order of methods
klc_data$method_order <- factor(klc_data$technique, levels = method_names)

# Create custom labels for methods (a-u) and datasets (1-26)
method_labels <- letters[1:length(method_names)]
names(method_labels) <- method_names
dataset_labels <- as.character(1:length(datasets))
names(dataset_labels) <- datasets

# Create plot
p <- ggplot(klc_data, aes(x = percentage, y = kappa_loss, color = factor(noise))) +
  geom_point(size = 0.8, alpha = 0.6) +
  geom_line(aes(group = factor(noise)), linewidth = 0.8) +
  labs(title = "Kappa Loss Curves by Dataset, Technique, and Noise Level",
       x = "Percentage of Instances", 
       y = "Kappa Loss", 
       color = "Noise Level") +
  theme_bw() +
  scale_y_continuous(limits = c(0.0, 1), breaks = seq(0, 1, by = 0.2)) +
  facet_grid(method_order ~ dataset_order, scales = "free", 
             labeller = labeller(method_order = method_labels, dataset_order = dataset_labels)) +
  theme(strip.text = element_text(size = 7),
        axis.text = element_text(size = 6),
        legend.position = "bottom")

# Print plot
print(p)

Distribution of Critical Percentages

ggplot(results %>% filter(!is.na(critical_percentage)), 
       aes(x = critical_percentage, fill = as.factor(threshold))) +
  geom_density(alpha = 0.7) +
  labs(title = "Distribution of Critical Percentages by Threshold Level",
       x = "Critical Percentage of Instances",
       y = "Density",
       fill = "Threshold") +
  theme_minimal() +
  scale_fill_manual(values = c("0.05" = "#94D2BD", "0.1" = "#0A9396", "0.15" = "#005F73")) +
  theme(legend.position = "bottom")

Kappa Loss Analysis

Average Kappa Loss by Threshold

# Calculate average kappa loss by threshold
kappa_loss_by_threshold <- results %>%
  filter(!is.na(kappa_at_critical)) %>%
  group_by(threshold) %>%
  summarise(
    mean_kappa_loss = mean(kappa_at_critical, na.rm = TRUE),
    median_kappa_loss = median(kappa_at_critical, na.rm = TRUE),
  )

# Display summary table
kable(kappa_loss_by_threshold, 
      caption = "Summary of Kappa Loss by Threshold Level", 
      digits = 3)
Summary of Kappa Loss by Threshold Level
threshold mean_kappa_loss median_kappa_loss
0.05 0.029 0.04
0.10 0.051 0.05
0.15 0.064 0.06

Kappa Loss by Machine Learning Technique and Threshold

# Calculate average kappa loss by technique and threshold
kappa_loss_by_technique <- results %>%
  filter(!is.na(kappa_at_critical)) %>%
  group_by(technique, threshold) %>%
  summarise(
    mean_kappa_loss = mean(kappa_at_critical, na.rm = TRUE),
    n_values = sum(!is.na(kappa_at_critical))
  ) %>%
  ungroup()

# Create a heatmap of kappa loss by technique and threshold
ggplot(kappa_loss_by_technique, aes(x = as.factor(threshold), y = reorder(technique, -mean_kappa_loss), fill = mean_kappa_loss)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Average Kappa Loss by ML Technique and Threshold",
       x = "Threshold",
       y = "Machine Learning Technique",
       fill = "Mean Kappa Loss") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8)) +
  geom_text(aes(label = sprintf("%.2f", mean_kappa_loss)), size = 3)

Dataset and Technique Information Table

# Create dataset information summary
dataset_info <- results %>%
  filter(!is.na(critical_percentage)) %>%
  group_by(dataset_name, technique, noise_level) %>%
  summarise(
    max_instances = max(critical_percentage, na.rm = TRUE),
    mean_kappa_loss = mean(kappa_at_critical, na.rm = TRUE),
    n_thresholds = n()
  ) %>%
  ungroup()

# Save to a dataframe
dataset_info_df <- as.data.frame(dataset_info)

# Save the complete dataset info to an RDS file
saveRDS(dataset_info_df, "results/dataset_technique_info.rds")

# Create interactive table
datatable(dataset_info_df,
          options = list(
            pageLength = 10,
            scrollX = TRUE,
            dom = 'Bfrtip',
            buttons = c('copy', 'csv', 'excel')
          ),
          extensions = 'Buttons',
          caption = "Complete Dataset, Technique, and Instance Information")

Best Performing Techniques

Higher critical percentage is better (can alter more instances while maintaining good performance)

# Find best performing techniques
best_techniques <- results %>%
  filter(!is.na(critical_percentage)) %>%
  group_by(threshold, technique) %>%
  summarise(mean_critical = mean(critical_percentage, na.rm = TRUE)) %>%
  arrange(threshold, desc(mean_critical)) %>%  # Sort in descending order to get highest values first
  group_by(threshold) %>%
  slice_head(n = 5)

kable(best_techniques, caption = "Top 5 Techniques with Highest Mean Critical Percentages by Threshold")
Top 5 Techniques with Highest Mean Critical Percentages by Threshold
threshold technique mean_critical
0.05 mlpML 89.48718
0.05 svmRadial 77.43590
0.05 simpls 76.66667
0.05 knn 76.53846
0.05 naive_bayes 75.38462
0.10 mlpML 95.89744
0.10 knn 91.66667
0.10 rfRules 91.15385
0.10 naive_bayes 88.46154
0.10 rf 88.46154
0.15 mlpML 98.46154
0.15 knn 96.02564
0.15 rfRules 95.89744
0.15 gbm 94.35897
0.15 JRip 93.84615